# install.packages("tidyverse")
# install.packages("readxl")
# install.packages("psych")
# install.packages("gridExtra")
library(psych)
library(scales)
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale
library(gridExtra)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.5 ✓ dplyr 1.0.3
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x ggplot2::%+%() masks psych::%+%()
## x ggplot2::alpha() masks scales::alpha(), psych::alpha()
## x readr::col_factor() masks scales::col_factor()
## x dplyr::combine() masks gridExtra::combine()
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl)
library(knitr)
Read data
# Paths of the two Excel workbooks, relative to the working directory.
HRMUrl <- "data/HRM_06.xlsx"
CallCentreUrl <- "data/callcentre_06.xlsx"

# Read each workbook into a tibble using read_excel()'s defaults.
HRMData <- read_excel(path = HRMUrl)
CallCentreData <- read_excel(path = CallCentreUrl)
CallCentre data description
# Print the structure (column names, types, leading values) of the call log.
str(CallCentreData)
## tibble [150,937 × 7] (S3: tbl_df/tbl/data.frame)
## $ time : num [1:150937] 2 3 3 3 3 3 3 3 3 3 ...
## $ length : num [1:150937] 227 288 189 206 261 166 231 209 200 255 ...
## $ forwarded : chr [1:150937] "NA" "NA" "NA" "NA" ...
## $ customer_satisfaction: num [1:150937] 10 7 7 9 NA 10 NA NA 8 9 ...
## $ waiting : num [1:150937] 24 0 54 282 151 10 26 0 212 56 ...
## $ problem : chr [1:150937] "delivery" "delivery" "delivery" "delivery" ...
## $ agent : num [1:150937] 58 19 53 95 113 16 61 62 27 95 ...
CallCentre data summary
# Per-column summary of the call log (quartiles for numeric columns,
# length/class for character columns).
summary(CallCentreData)
## time length forwarded customer_satisfaction
## Min. : 2.0 Min. : 5.0 Length:150937 Min. : 1.0
## 1st Qu.: 9.0 1st Qu.: 198.0 Class :character 1st Qu.: 7.0
## Median :12.0 Median : 246.0 Mode :character Median : 9.0
## Mean :13.5 Mean : 311.1 Mean : 8.3
## 3rd Qu.:18.0 3rd Qu.: 399.0 3rd Qu.:10.0
## Max. :24.0 Max. :1332.0 Max. :10.0
## NA's :42461
## waiting problem agent
## Min. : 0.00 Length:150937 Min. : 1.00
## 1st Qu.: 0.00 Class :character 1st Qu.: 31.00
## Median : 29.00 Mode :character Median : 58.00
## Mean : 52.93 Mean : 59.36
## 3rd Qu.: 90.00 3rd Qu.: 88.00
## Max. :561.00 Max. :122.00
##
HRM data description
# Print the structure (column names, types, leading values) of the HR table.
str(HRMData)
## tibble [120 × 6] (S3: tbl_df/tbl/data.frame)
## $ gender : chr [1:120] "male" "female" "male" "male" ...
## $ tenure : num [1:120] 10 51 50 21 5 61 70 7 45 50 ...
## $ age : num [1:120] 19 22 23 23 24 24 24 24 25 25 ...
## $ qualification: chr [1:120] "some college" "apprenticeship" "apprenticeship" "apprenticeship" ...
## $ ethnicity : chr [1:120] "Black" "British" "Black" "Black" ...
## $ agent : chr [1:120] "109" "65" "67" "92" ...
HRM data summary
# Per-column summary of the HR table.
summary(HRMData)
## gender tenure age qualification
## Length:120 Min. : 0.00 Min. :19.00 Length:120
## Class :character 1st Qu.: 26.25 1st Qu.:27.00 Class :character
## Mode :character Median : 54.50 Median :29.00 Mode :character
## Mean : 65.47 Mean :28.88
## 3rd Qu.: 91.50 3rd Qu.:31.00
## Max. :229.00 Max. :38.00
## ethnicity agent
## Length:120 Length:120
## Class :character Class :character
## Mode :character Mode :character
##
##
##
Number of calls by time
# Number of calls for each value of `time`.
count(CallCentreData, time)

# Raise the penalty for scientific notation so values print in fixed notation.
options(scipen = 3)

# Same counts plus each hour's share of all calls, in percent.
CallCentreData %>%
  count(time) %>%
  mutate(percentage = n / sum(n) * 100)

# Bar chart of the hourly call counts, with the count printed above each bar.
CallCentreData %>%
  count(time) %>%
  ggplot(aes(x = time, y = n)) +
  geom_col(
    position = position_dodge(),
    fill = "#4126de",
    color = "#e9ecef",
    alpha = 0.9
  ) +
  geom_text(
    aes(label = n),
    vjust = -0.5,
    position = position_dodge(0.9),
    size = 2.5
  ) +
  labs(
    x = "Time of the day (in hours)",
    y = "Number of call",
    title = "Number Of Call By Time"
  ) +
  theme_minimal()
Average duration of the call by time
# Mean call duration (`length`) for each value of `time`.
CallCentreData %>%
  group_by(time) %>%
  summarise(mean = mean(length, na.rm = TRUE))

# Overall mean duration, computed once with the same NA handling as the
# per-hour means so the reference line and its label always agree.
overallMeanLength <- mean(CallCentreData$length, na.rm = TRUE)

# Bar chart of the rounded per-hour means with a reference line at the
# overall mean.
CallCentreData %>%
  group_by(time) %>%
  summarise(mean = round(mean(length, na.rm = TRUE))) %>%
  ggplot(aes(x = time, y = mean)) +
  geom_bar(
    stat = "identity", position = position_dodge(),
    fill = "#69b3a2", color = "#e9ecef", alpha = 0.9
  ) +
  ylab("Average length of time") +
  xlab("Time of the day (in hours)") +
  geom_text(
    aes(label = mean),
    vjust = -0.5,
    position = position_dodge(0.9),
    size = 2.5
  ) +
  geom_hline(yintercept = overallMeanLength, color = "#fc0303") +
  # annotate() draws the label exactly once; geom_text() with constant
  # aesthetics would draw one copy per data row on top of each other.
  annotate(
    "text",
    x = 1, y = overallMeanLength,
    label = paste("Average of all time:", round(overallMeanLength)),
    hjust = 0, vjust = -1, size = 3.5
  ) +
  ggtitle("Average Length Of Time Of The Call By Time") +
  theme_minimal()
% of problem
# Frequency and percentage of each problem type, most frequent first.
CallCentreData %>%
  count(problem) %>%
  mutate(percentage = n / sum(n) * 100) %>%
  arrange(desc(n))

# Table used by the pie chart: counts, percentage rounded to one decimal,
# and a "<problem> (<pct>%)" label per slice.
problemPie <- CallCentreData %>%
  count(problem) %>%
  mutate(
    percentage = round(n / sum(n) * 100, 1),
    labels = paste0(problem, " (", percentage, "%)")
  )

library(RColorBrewer)

# Five colours from the "Set2" ColorBrewer palette for the slices.
myPalette <- brewer.pal(5, "Set2")

pie(
  x = problemPie$n,
  labels = problemPie$labels,
  col = myPalette,
  border = "white",
  main = "Percentage Of Problem"
)
% of forward
# `forwarded` is a character column in which the literal string "NA" is
# relabelled as "not forwarded"; other values are left unchanged by recode().

# Count and percentage of forwarded vs. not-forwarded calls.
CallCentreData %>%
  mutate(forwarded = recode(forwarded, "NA" = "not forwarded")) %>%
  count(forwarded) %>%
  mutate(percentage = n / sum(n) * 100)

# Pie chart of the same split.
CallCentreData %>%
  mutate(forwarded = recode(forwarded, "NA" = "not forwarded")) %>%
  count(forwarded) %>%
  mutate(percentage = round(n / sum(n) * 100, 1)) %>%
  ggplot(aes(x = "", y = percentage, fill = forwarded)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void(base_size = 13) +
  theme(legend.position = "none") +
  ggtitle("Percentage Forwarded") +
  # Let ggplot2 place each label in the middle of its own slice. A manual
  # cumsum() of the percentages assumes bottom-up stacking in row order, but
  # geom_bar() stacks groups in reverse order, so hand-computed positions can
  # put a label in the wrong slice.
  geom_text(
    aes(label = paste0(forwarded, "\n", n, " (", percentage, "%)")),
    position = position_stack(vjust = 0.5),
    color = "white", size = 4
  ) +
  scale_fill_brewer(palette = "Set1")
forward by time
# Calls per hour split into forwarded / not forwarded, one column per status.
# Hours with no calls of a given status get 0 instead of NA.
CallCentreData %>%
  mutate(forwarded = recode(forwarded, "NA" = "not forwarded")) %>%
  count(time, forwarded) %>%
  pivot_wider(
    names_from = forwarded,
    values_from = n,
    values_fill = list(n = 0L),
    names_sort = TRUE
  )

# Stacked bar chart of the same counts.
CallCentreData %>%
  mutate(forwarded = recode(forwarded, "NA" = "not forwarded")) %>%
  count(time, forwarded) %>%
  ggplot(aes(x = time, y = n, fill = forwarded)) +
  geom_bar(stat = "identity") +
  theme(legend.title = element_blank()) +
  ggtitle("Forwarded By Time") +
  # The y aesthetic is the call count `n`, not a duration.
  ylab("Number of call") +
  xlab("Time of the day (in hours)")
Overall satisfaction
# Five-number summary, mean and NA count of the satisfaction score.
summary(CallCentreData$customer_satisfaction)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.0 7.0 9.0 8.3 10.0 10.0 42461
# Horizontal, notched base-graphics boxplot of the satisfaction score.
boxplot(
  x = CallCentreData$customer_satisfaction,
  horizontal = TRUE,
  notch = TRUE,
  col = "orange",
  border = "brown",
  xlab = "Customer satisfaction score",
  main = "Satisfaction With The Service"
)
# Share of rated calls per satisfaction band:
#   score < 5 -> "Low", 5 <= score < 8 -> "Middle", score >= 8 -> "High".
# Calls without a score are excluded before the shares are computed.
CallCentreData %>%
  filter(!is.na(customer_satisfaction)) %>%
  mutate(
    satisfaction_group = case_when(
      customer_satisfaction < 5 ~ "Low",
      customer_satisfaction < 8 ~ "Middle",
      TRUE ~ "High"
    )
  ) %>%
  count(satisfaction_group) %>%
  mutate(percentage = n / sum(n) * 100)
# Number of calls for each satisfaction score; calls with no score form
# their own NA group.
count(group_by(CallCentreData, customer_satisfaction))

# Bar chart of those counts with the count printed above each bar.
CallCentreData %>%
  count(customer_satisfaction) %>%
  ggplot(aes(x = customer_satisfaction, y = n)) +
  geom_bar(stat = "identity", position = position_dodge(), alpha = 0.9) +
  geom_text(
    aes(label = n),
    vjust = -0.5,
    position = position_dodge(0.9),
    size = 2.5
  ) +
  labs(
    x = "Customer satisfaction score",
    y = "Number of call",
    title = "Satisfaction With The Service"
  ) +
  theme_minimal()
## Warning: Removed 1 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_text).
satisfaction by duration
# Mean call duration and mean waiting time for each satisfaction score.
CallCentreData %>%
  group_by(customer_satisfaction) %>%
  summarise(
    mean_length = mean(length, na.rm = TRUE),
    mean_waiting = mean(waiting, na.rm = TRUE)
  )

# Scatter plot of duration against satisfaction with a linear fit.
ggplot(CallCentreData, aes(x = customer_satisfaction, y = length)) +
  # Colour is a fixed setting, so it goes outside aes(). Inside aes() the
  # string "blue" is treated as a data value and drawn in the default
  # discrete colour instead of blue.
  geom_point(color = "blue", alpha = 0.8) +
  geom_smooth(formula = y ~ x, method = "lm") +
  ggtitle("Satisfaction By Duration Of The Call") +
  xlab("Customer satisfaction score") +
  ylab("Duration of the call") +
  theme_bw(base_size = 12)
## Warning: Removed 42461 rows containing non-finite values (stat_smooth).
## Warning: Removed 42461 rows containing missing values (geom_point).
# One horizontal boxplot of call duration per satisfaction score; the score
# is converted to a factor so each value gets its own box and fill colour.
CallCentreData %>%
  ggplot(
    aes(
      x = length,
      y = as_factor(customer_satisfaction),
      fill = as_factor(customer_satisfaction)
    )
  ) +
  geom_boxplot() +
  labs(
    title = "Satisfaction By Duration Of The Call",
    x = "Duration of the call",
    y = "Customer satisfaction score"
  ) +
  theme(legend.position = "none")
satisfaction by waiting
# Scatter plot of waiting time against satisfaction with a linear fit.
ggplot(CallCentreData, aes(x = customer_satisfaction, y = waiting)) +
  # Colour is a fixed setting, so it goes outside aes(). Inside aes() the
  # string "blue" is treated as a data value and drawn in the default
  # discrete colour instead of blue.
  geom_point(color = "blue", alpha = 0.8) +
  geom_smooth(formula = y ~ x, method = "lm") +
  ggtitle("Satisfaction By Waiting Time") +
  xlab("Customer satisfaction score") +
  ylab("Waiting time") +
  theme_bw(base_size = 12)
## Warning: Removed 42461 rows containing non-finite values (stat_smooth).
## Warning: Removed 42461 rows containing missing values (geom_point).
# One horizontal boxplot of waiting time per satisfaction score; the score
# is converted to a factor so each value gets its own box and fill colour.
CallCentreData %>%
  ggplot(
    aes(
      x = waiting,
      y = as_factor(customer_satisfaction),
      fill = as_factor(customer_satisfaction)
    )
  ) +
  geom_boxplot() +
  labs(
    title = "Satisfaction By Waiting Time",
    x = "Waiting time",
    y = "Customer satisfaction score"
  ) +
  theme(legend.position = "none")
satisfaction by problem
# Descriptive statistics of the satisfaction score for each problem type.
# Missing scores are ignored in the statistics and counted in `Missing`.
CallCentreData %>%
  group_by(problem) %>%
  summarise(
    Min = min(customer_satisfaction, na.rm = TRUE),
    "1st Quantile" = quantile(customer_satisfaction, probs = 0.25, na.rm = TRUE),
    Median = median(customer_satisfaction, na.rm = TRUE),
    Mean = mean(customer_satisfaction, na.rm = TRUE),
    "3rd Quantile" = quantile(customer_satisfaction, probs = 0.75, na.rm = TRUE),
    Max = max(customer_satisfaction, na.rm = TRUE),
    Missing = sum(is.na(customer_satisfaction))
  )
# Horizontal boxplots of the satisfaction score, one per problem type.
# The y-axis text and ticks are hidden; the fill legend identifies the boxes.
CallCentreData %>%
  ggplot(aes(x = customer_satisfaction, y = problem, fill = problem)) +
  geom_boxplot() +
  labs(
    title = "Satisfaction By Problem",
    x = "Customer satisfaction score",
    y = ""
  ) +
  theme(
    axis.ticks = element_blank(),
    axis.text.y = element_blank()
  )
## Warning: Removed 42461 rows containing non-finite values (stat_boxplot).
waiting time
# Five-number summary and mean of the waiting time.
summary(CallCentreData$waiting)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 29.00 52.93 90.00 561.00
# Horizontal, notched base-graphics boxplot of the waiting time.
boxplot(
  x = CallCentreData$waiting,
  horizontal = TRUE,
  notch = TRUE,
  col = "orange",
  border = "brown",
  xlab = "Waiting time in second",
  main = "Waiting time"
)
# Share of calls per waiting-time band:
#   below 60 -> "< 60s", 60 to 120 inclusive -> "60s - 120s", above -> "> 120s".
CallCentreData %>%
  filter(!is.na(waiting)) %>%
  mutate(
    waiting_group = case_when(
      waiting < 60 ~ "< 60s",
      waiting <= 120 ~ "60s - 120s",
      TRUE ~ "> 120s"
    )
  ) %>%
  count(waiting_group) %>%
  mutate(percentage = n / sum(n) * 100)
agent performance by number of calls
# Calls handled per agent, busiest first.
CallCentreData %>%
  count(agent) %>%
  arrange(desc(n))

# Lollipop chart of the ten agents with the most calls.
Top10byCall <- CallCentreData %>%
  count(agent) %>%
  mutate(agent = paste("Agent", agent)) %>%
  arrange(desc(n)) %>%
  head(10) %>%
  mutate(agent = fct_reorder(agent, n)) %>%
  ggplot(aes(x = n, y = agent)) +
  geom_segment(aes(x = 0, xend = n, y = agent, yend = agent), color = "blue") +
  geom_point(color = "green", size = 4) +
  theme_light() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  labs(x = "", y = "", title = "Top 10") +
  xlim(0, 2500)

# Lollipop chart of the ten agents with the fewest calls, on the same x range.
Tail10byCall <- CallCentreData %>%
  count(agent) %>%
  mutate(agent = paste("Agent", agent)) %>%
  arrange(desc(n)) %>%
  tail(10) %>%
  mutate(agent = fct_reorder(agent, n)) %>%
  ggplot(aes(x = n, y = agent)) +
  geom_segment(aes(x = 0, xend = n, y = agent, yend = agent), color = "red") +
  geom_point(color = "red", size = 4) +
  theme_light() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  labs(x = "", y = "", title = "Bottom 10") +
  xlim(0, 2500)

# Show both rankings side by side under a shared title.
grid.arrange(
  Top10byCall, Tail10byCall,
  nrow = 1, ncol = 2,
  top = "Agent Ranking By Number Of Call"
)
agent performance by sum duration of the call
# Total call duration per agent, largest first.
CallCentreData %>%
  group_by(agent) %>%
  summarise("Sum_length" = sum(length, na.rm = TRUE)) %>%
  arrange(desc(Sum_length))

# Lollipop chart of the ten agents with the largest total duration.
# Note: theme() is called without a trailing comma; an empty trailing
# argument is an error when the arguments are collected with list(...).
Top10byDuration <- CallCentreData %>%
  group_by(agent) %>%
  summarise(n = sum(length, na.rm = TRUE)) %>%
  mutate(agent = paste("Agent", agent)) %>%
  arrange(desc(n)) %>%
  mutate(agent = fct_reorder(agent, n)) %>%
  head(10) %>%
  ggplot(aes(x = n, y = agent)) +
  geom_segment(aes(x = 0, xend = n, y = agent, yend = agent), color = "blue") +
  geom_point(color = "green", size = 4) +
  theme_light() +
  theme(panel.border = element_blank()) +
  xlab("") +
  ggtitle("Top 10") +
  scale_x_continuous(labels = number, limits = c(0, 800000)) +
  ylab("")

# Lollipop chart of the ten agents with the smallest total duration,
# on the same x range.
Tail10byDuration <- CallCentreData %>%
  group_by(agent) %>%
  summarise(n = sum(length, na.rm = TRUE)) %>%
  mutate(agent = paste("Agent", agent)) %>%
  arrange(desc(n)) %>%
  mutate(agent = fct_reorder(agent, n)) %>%
  tail(10) %>%
  ggplot(aes(x = n, y = agent)) +
  geom_segment(aes(x = 0, xend = n, y = agent, yend = agent), color = "red") +
  geom_point(color = "red", size = 4) +
  theme_light() +
  theme(panel.border = element_blank()) +
  xlab("") +
  ggtitle("Bottom 10") +
  scale_x_continuous(labels = number, limits = c(0, 800000)) +
  ylab("")

# Show both rankings side by side under a shared title.
grid.arrange(
  Top10byDuration, Tail10byDuration,
  nrow = 1, ncol = 2,
  top = "Agent Ranking By Sum Duration Of The Call"
)
agent performance by satisfaction
# Mean satisfaction score per agent (unrated calls ignored), highest first.
CallCentreData %>%
  group_by(agent) %>%
  summarise("Mean_satisfaction" = mean(customer_satisfaction, na.rm = TRUE)) %>%
  arrange(desc(Mean_satisfaction))

# Lollipop chart of the ten agents with the highest mean satisfaction.
# Note: theme() is called without a trailing comma; an empty trailing
# argument is an error when the arguments are collected with list(...).
Top10bySatisfaction <- CallCentreData %>%
  group_by(agent) %>%
  summarise(n = mean(customer_satisfaction, na.rm = TRUE)) %>%
  mutate(agent = paste("Agent", agent)) %>%
  arrange(desc(n)) %>%
  mutate(agent = fct_reorder(agent, n)) %>%
  head(10) %>%
  ggplot(aes(x = n, y = agent)) +
  geom_segment(aes(x = 0, xend = n, y = agent, yend = agent), color = "blue") +
  geom_point(color = "green", size = 4) +
  theme_light() +
  theme(panel.border = element_blank()) +
  xlab("") +
  ggtitle("Top 10") +
  scale_x_continuous(labels = number, limits = c(0, 10)) +
  ylab("")

# Lollipop chart of the ten agents with the lowest mean satisfaction,
# on the same x range.
Tail10bySatisfaction <- CallCentreData %>%
  group_by(agent) %>%
  summarise(n = mean(customer_satisfaction, na.rm = TRUE)) %>%
  mutate(agent = paste("Agent", agent)) %>%
  arrange(desc(n)) %>%
  mutate(agent = fct_reorder(agent, n)) %>%
  tail(10) %>%
  ggplot(aes(x = n, y = agent)) +
  geom_segment(aes(x = 0, xend = n, y = agent, yend = agent), color = "red") +
  geom_point(color = "red", size = 4) +
  theme_light() +
  theme(panel.border = element_blank()) +
  xlab("") +
  ggtitle("Bottom 10") +
  scale_x_continuous(labels = number, limits = c(0, 10)) +
  ylab("")

# Show both rankings side by side under a shared title.
grid.arrange(
  Top10bySatisfaction, Tail10bySatisfaction,
  nrow = 1, ncol = 2,
  top = "Agent Ranking By Customer Satisfaction"
)
agent performance by waiting time
# Mean waiting time per agent, longest first.
CallCentreData %>%
  group_by(agent) %>%
  summarise("Mean_waiting" = mean(waiting, na.rm = TRUE)) %>%
  arrange(desc(Mean_waiting))

# For waiting time a smaller value is the better result, so the ranking is
# inverted relative to the other agent charts: the ten LONGEST mean waits
# form the "Bottom 10" panel (red), the ten SHORTEST the "Top 10" panel.
# The panel titles now agree with the object names, the colours and the
# left/right layout used by the other rankings.
# Note: theme() is called without a trailing comma; an empty trailing
# argument is an error when the arguments are collected with list(...).

# Ten agents with the longest mean waiting time.
Tail10byWaiting <- CallCentreData %>%
  group_by(agent) %>%
  summarise(n = mean(waiting, na.rm = TRUE)) %>%
  mutate(agent = paste("Agent", agent)) %>%
  arrange(desc(n)) %>%
  mutate(agent = fct_reorder(agent, desc(n))) %>%
  head(10) %>%
  ggplot(aes(x = n, y = agent)) +
  geom_segment(aes(x = 0, xend = n, y = agent, yend = agent), color = "red") +
  geom_point(color = "red", size = 4) +
  theme_light() +
  theme(panel.border = element_blank()) +
  xlab("") +
  ggtitle("Bottom 10") +
  scale_x_continuous(labels = number, limits = c(0, 60)) +
  ylab("")

# Ten agents with the shortest mean waiting time.
Top10byWaiting <- CallCentreData %>%
  group_by(agent) %>%
  summarise(n = mean(waiting, na.rm = TRUE)) %>%
  mutate(agent = paste("Agent", agent)) %>%
  arrange(desc(n)) %>%
  mutate(agent = fct_reorder(agent, desc(n))) %>%
  tail(10) %>%
  ggplot(aes(x = n, y = agent)) +
  geom_segment(aes(x = 0, xend = n, y = agent, yend = agent), color = "blue") +
  geom_point(color = "green", size = 4) +
  theme_light() +
  theme(panel.border = element_blank()) +
  xlab("") +
  ggtitle("Top 10") +
  scale_x_continuous(labels = number, limits = c(0, 60)) +
  ylab("")

# Show both rankings side by side under a shared title.
grid.arrange(
  Top10byWaiting, Tail10byWaiting,
  nrow = 1, ncol = 2,
  top = "Agent Ranking By Waiting Time"
)
agent by problem
# Number of calls per agent and problem type, one row per agent and one
# column per problem type (alphabetical). Combinations that never occur
# are left as NA. The result is an ungrouped tibble.
CallCentreData %>%
  count(agent, problem) %>%
  pivot_wider(names_from = problem, values_from = n, names_sort = TRUE)
allocation of agent by gender
# Count and percentage of agents per gender (missing gender included).
HRMData %>%
  count(gender) %>%
  mutate(percentage = n / sum(n) * 100)

# Pie chart of the gender split among agents with a recorded gender.
HRMData %>%
  filter(!is.na(gender)) %>%
  count(gender) %>%
  mutate(percentage = round(n / sum(n) * 100, 1)) %>%
  ggplot(aes(x = "", y = percentage, fill = gender)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void(base_size = 13) +
  theme(legend.position = "none") +
  ggtitle("Percentage Gender") +
  # position_stack(vjust = 0.5) centres each label in its own slice using
  # the same stacking as the bars, so the placement does not depend on the
  # row order of the data.
  geom_text(
    aes(label = paste0(gender, "\n", n, " (", percentage, "%)")),
    position = position_stack(vjust = 0.5),
    color = "white", size = 4
  )
agent by tenure
# Five-number summary and mean of agent tenure.
summary(HRMData$tenure)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 26.25 54.50 65.47 91.50 229.00
# Mean tenure, computed once and reused for the reference line and its label.
meanTenure <- mean(HRMData$tenure, na.rm = TRUE)

# Density-scaled histogram of tenure with a kernel density overlay and a
# vertical line at the mean.
ggplot(HRMData, aes(x = tenure)) +
  geom_histogram(
    aes(y = after_stat(density)),
    fill = "#999999", bins = 20, position = "identity", alpha = 0.7
  ) +
  geom_density(alpha = 0.6, color = "darkblue", fill = "lightblue") +
  labs(
    title = "Distribution Of Agent Tenure",
    x = "Number of months",
    y = "Density"
  ) +
  theme_classic(base_size = 14) +
  geom_vline(xintercept = meanTenure, color = "#fc0303") +
  # annotate() draws the label exactly once; geom_text() with constant
  # aesthetics would draw one copy per data row on top of each other.
  annotate(
    "text",
    x = meanTenure, y = 0,
    label = paste("Mean:", round(meanTenure)),
    hjust = -0.1, vjust = 0, size = 3.5
  )
agent by qualification
# Number of agents per qualification, most common first.
HRMData %>%
  count(qualification) %>%
  arrange(desc(n))

# Pie chart of the qualification split. Rows whose qualification is the
# literal string "NA" (or a missing value) are excluded.
HRMData %>%
  filter(qualification != "NA") %>%
  count(qualification) %>%
  mutate(percentage = round(n / sum(n) * 100, 1)) %>%
  ggplot(aes(x = "", y = percentage, fill = qualification)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void(base_size = 10) +
  theme(legend.position = "none") +
  ggtitle("Percentage Of Education Degree") +
  # position_stack(vjust = 0.5) centres each label in its own slice using
  # the same stacking as the bars. Positions derived from a manual cumsum()
  # depend on row order and can land in a different slice.
  geom_text(
    aes(label = paste0(qualification, "\n", n, " (", percentage, "%)")),
    position = position_stack(vjust = 0.5),
    color = "white", size = 4
  )
agent by ethnicity
# Number of agents per ethnicity, most common first.
HRMData %>%
  count(ethnicity) %>%
  arrange(desc(n))

# Pie chart of the ethnicity split among agents with a recorded ethnicity.
HRMData %>%
  filter(!is.na(ethnicity)) %>%
  count(ethnicity) %>%
  mutate(percentage = round(n / sum(n) * 100, 1)) %>%
  ggplot(aes(x = "", y = percentage, fill = ethnicity)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void(base_size = 10) +
  theme(legend.position = "none") +
  ggtitle("Percentage Of Employee’s Ethnic") +
  # position_stack(vjust = 0.5) centres each label in its own slice using
  # the same stacking as the bars. Positions derived from a manual cumsum()
  # depend on row order and can land in a different slice.
  geom_text(
    aes(label = paste0(ethnicity, "\n", n, " (", percentage, "%)")),
    position = position_stack(vjust = 0.5),
    color = "white", size = 4
  )
Combine Data
# Give the join key the same (numeric) type in both tables; in HRMData the
# agent id is stored as character.
HRMData[["agent"]] <- as.numeric(HRMData[["agent"]])
CallCentreData[["agent"]] <- as.numeric(CallCentreData[["agent"]])

# Attach each agent's HR attributes to every call; calls whose agent has no
# HR record are kept with NA in the HR columns.
CombineData <- CallCentreData %>%
  left_join(HRMData, by = "agent")
satisfaction by gender
# Descriptive statistics of the satisfaction score by agent gender.
# Missing scores are ignored in the statistics and counted in `Missing`.
CombineData %>%
  group_by(gender) %>%
  summarise(
    Count = n(),
    Min = min(customer_satisfaction, na.rm = TRUE),
    "1st Quantile" = quantile(customer_satisfaction, probs = 0.25, na.rm = TRUE),
    Median = median(customer_satisfaction, na.rm = TRUE),
    Mean = mean(customer_satisfaction, na.rm = TRUE),
    "3rd Quantile" = quantile(customer_satisfaction, probs = 0.75, na.rm = TRUE),
    Max = max(customer_satisfaction, na.rm = TRUE),
    Missing = sum(is.na(customer_satisfaction))
  )

# Boxplots of the satisfaction score by gender. Rows with a missing gender
# or the literal string "NA" are dropped in a single filter.
CombineData %>%
  filter(!is.na(gender), gender != "NA") %>%
  ggplot(aes(x = customer_satisfaction, y = gender, fill = gender)) +
  geom_boxplot() +
  ggtitle("Satisfaction By Sex") +
  ylab("Sex") +
  xlab("Satisfaction") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )
## Warning: Removed 41385 rows containing non-finite values (stat_boxplot).
# Two-sample t-test of the satisfaction score between the gender groups.
t.test(customer_satisfaction ~ gender, data = CombineData)
##
## Welch Two Sample t-test
##
## data: customer_satisfaction by gender
## t = 1.4612, df = 57281, p-value = 0.144
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.005965469 0.040919404
## sample estimates:
## mean in group female mean in group male
## 8.315402 8.297925
Duration by gender
# Descriptive statistics of call duration by agent gender.
CombineData %>%
  group_by(gender) %>%
  summarise(
    Count = n(),
    Min = min(length, na.rm = TRUE),
    "1st Quantile" = quantile(length, probs = 0.25, na.rm = TRUE),
    Median = median(length, na.rm = TRUE),
    Mean = mean(length, na.rm = TRUE),
    "3rd Quantile" = quantile(length, probs = 0.75, na.rm = TRUE),
    Max = max(length, na.rm = TRUE),
    Missing = sum(is.na(length))
  )

# Boxplots of call duration by gender. Rows with a missing gender or the
# literal string "NA" are dropped in a single filter.
CombineData %>%
  filter(!is.na(gender), gender != "NA") %>%
  ggplot(aes(x = length, y = gender, fill = gender)) +
  geom_boxplot() +
  ggtitle("Duration By Sex") +
  ylab("Sex") +
  xlab("Duration") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )
# Two-sample t-test of call duration between the gender groups.
t.test(length ~ gender, data = CombineData)
##
## Welch Two Sample t-test
##
## data: length by gender
## t = 1.0982, df = 79315, p-value = 0.2721
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.9074283 3.2201456
## sample estimates:
## mean in group female mean in group male
## 312.0080 310.8516
Waiting time by gender
# Descriptive statistics of waiting time by agent gender.
CombineData %>%
  group_by(gender) %>%
  summarise(
    Count = n(),
    Min = min(waiting, na.rm = TRUE),
    "1st Quantile" = quantile(waiting, probs = 0.25, na.rm = TRUE),
    Median = median(waiting, na.rm = TRUE),
    Mean = mean(waiting, na.rm = TRUE),
    "3rd Quantile" = quantile(waiting, probs = 0.75, na.rm = TRUE),
    Max = max(waiting, na.rm = TRUE),
    Missing = sum(is.na(waiting))
  )

# Boxplots of waiting time by gender. The x aesthetic is `waiting`, matching
# the title and axis label (it previously plotted customer_satisfaction).
CombineData %>%
  filter(!is.na(gender), gender != "NA") %>%
  ggplot(aes(x = waiting, y = gender, fill = gender)) +
  geom_boxplot() +
  ggtitle("Waiting Time By Sex") +
  ylab("Sex") +
  xlab("Waiting time") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )
## Warning: Removed 41385 rows containing non-finite values (stat_boxplot).
# Two-sample t-test of waiting time between the gender groups.
t.test(waiting ~ gender, data = CombineData)
##
## Welch Two Sample t-test
##
## data: waiting by gender
## t = 1.1163, df = 78806, p-value = 0.2643
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3102183 1.1310442
## sample estimates:
## mean in group female mean in group male
## 53.30955 52.89913
satisfaction by qualification
# Descriptive statistics of the satisfaction score by agent qualification.
# Missing scores are ignored in the statistics and counted in `Missing`.
CombineData %>%
  group_by(qualification) %>%
  summarise(
    Count = n(),
    Min = min(customer_satisfaction, na.rm = TRUE),
    "1st Quantile" = quantile(customer_satisfaction, probs = 0.25, na.rm = TRUE),
    Median = median(customer_satisfaction, na.rm = TRUE),
    Mean = mean(customer_satisfaction, na.rm = TRUE),
    "3rd Quantile" = quantile(customer_satisfaction, probs = 0.75, na.rm = TRUE),
    Max = max(customer_satisfaction, na.rm = TRUE),
    Missing = sum(is.na(customer_satisfaction))
  )

# Boxplots of the satisfaction score by qualification. Rows with a missing
# qualification or the literal string "NA" are dropped in a single filter.
CombineData %>%
  filter(!is.na(qualification), qualification != "NA") %>%
  ggplot(aes(x = customer_satisfaction, y = qualification, fill = qualification)) +
  geom_boxplot() +
  ggtitle("Satisfaction By Qualification") +
  ylab("Qualification") +
  xlab("Satisfaction") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )
## Warning: Removed 40355 rows containing non-finite values (stat_boxplot).
# One-way ANOVA of the satisfaction score across qualification groups,
# excluding rows whose qualification is the literal string "NA".
summary(
  aov(
    customer_satisfaction ~ qualification,
    data = filter(CombineData, qualification != "NA")
  )
)
## Df Sum Sq Mean Sq F value Pr(>F)
## qualification 2 2 1.156 0.369 0.692
## Residuals 103020 323246 3.138
## 40355 observations deleted due to missingness
Duration by qualification
# Descriptive statistics of call duration by agent qualification.
CombineData %>%
  group_by(qualification) %>%
  summarise(
    Count = n(),
    Min = min(length, na.rm = TRUE),
    "1st Quantile" = quantile(length, probs = 0.25, na.rm = TRUE),
    Median = median(length, na.rm = TRUE),
    Mean = mean(length, na.rm = TRUE),
    "3rd Quantile" = quantile(length, probs = 0.75, na.rm = TRUE),
    Max = max(length, na.rm = TRUE),
    Missing = sum(is.na(length))
  )

# Boxplots of call duration by qualification. Rows with a missing
# qualification or the literal string "NA" are dropped in a single filter.
CombineData %>%
  filter(!is.na(qualification), qualification != "NA") %>%
  ggplot(aes(x = length, y = qualification, fill = qualification)) +
  geom_boxplot() +
  ggtitle("Duration By Qualification") +
  ylab("Qualification") +
  xlab("Duration") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )
# One-way ANOVA of call duration across qualification groups, excluding
# rows whose qualification is the literal string "NA".
summary(
  aov(
    length ~ qualification,
    data = filter(CombineData, qualification != "NA")
  )
)
## Df Sum Sq Mean Sq F value Pr(>F)
## qualification 2 55555 27777 0.825 0.438
## Residuals 143375 4824590339 33650
Waiting time by qualification
# Descriptive statistics of waiting time by agent qualification.
CombineData %>%
  group_by(qualification) %>%
  summarise(
    Count = n(),
    Min = min(waiting, na.rm = TRUE),
    "1st Quantile" = quantile(waiting, probs = 0.25, na.rm = TRUE),
    Median = median(waiting, na.rm = TRUE),
    Mean = mean(waiting, na.rm = TRUE),
    "3rd Quantile" = quantile(waiting, probs = 0.75, na.rm = TRUE),
    Max = max(waiting, na.rm = TRUE),
    Missing = sum(is.na(waiting))
  )

# Boxplots of waiting time by qualification (drawn once; the identical
# second copy of this plot has been removed). Rows with a missing
# qualification or the literal string "NA" are dropped in a single filter.
CombineData %>%
  filter(!is.na(qualification), qualification != "NA") %>%
  ggplot(aes(x = waiting, y = qualification, fill = qualification)) +
  geom_boxplot() +
  ggtitle("Waiting Time By Qualification") +
  ylab("Qualification") +
  xlab("Waiting time") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )
# One-way ANOVA of waiting time across qualification groups, excluding
# rows whose qualification is the literal string "NA".
summary(
  aov(
    waiting ~ qualification,
    data = filter(CombineData, qualification != "NA")
  )
)
## Df Sum Sq Mean Sq F value Pr(>F)
## qualification 2 23169 11585 2.84 0.0584 .
## Residuals 143375 584826332 4079
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
satisfaction by ethnicity
# Descriptive statistics of the satisfaction score by agent ethnicity.
# Missing scores are ignored in the statistics and counted in `Missing`.
CombineData %>%
  group_by(ethnicity) %>%
  summarise(
    Count = n(),
    Min = min(customer_satisfaction, na.rm = TRUE),
    "1st Quantile" = quantile(customer_satisfaction, probs = 0.25, na.rm = TRUE),
    Median = median(customer_satisfaction, na.rm = TRUE),
    Mean = mean(customer_satisfaction, na.rm = TRUE),
    "3rd Quantile" = quantile(customer_satisfaction, probs = 0.75, na.rm = TRUE),
    Max = max(customer_satisfaction, na.rm = TRUE),
    Missing = sum(is.na(customer_satisfaction))
  )

# Boxplots of the satisfaction score by ethnicity. The x aesthetic is
# `customer_satisfaction`, matching the title and axis label (it previously
# plotted `length`, i.e. the call duration).
CombineData %>%
  filter(!is.na(ethnicity)) %>%
  ggplot(aes(x = customer_satisfaction, y = ethnicity, fill = ethnicity)) +
  geom_boxplot() +
  ggtitle("Satisfaction By Ethnicity") +
  ylab("Ethnicity") +
  xlab("Satisfaction") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )
# One-way ANOVA of the satisfaction score across ethnicity groups.
summary(aov(customer_satisfaction ~ ethnicity, data = CombineData))
## Df Sum Sq Mean Sq F value Pr(>F)
## ethnicity 3 3 0.9246 0.295 0.829
## Residuals 103484 324615 3.1369
## 47449 observations deleted due to missingness
Duration by ethnicity
# Descriptive statistics of call duration by agent ethnicity.
CombineData %>%
  group_by(ethnicity) %>%
  summarise(
    Count = n(),
    Min = min(length, na.rm = TRUE),
    "1st Quantile" = quantile(length, probs = 0.25, na.rm = TRUE),
    Median = median(length, na.rm = TRUE),
    Mean = mean(length, na.rm = TRUE),
    "3rd Quantile" = quantile(length, probs = 0.75, na.rm = TRUE),
    Max = max(length, na.rm = TRUE),
    Missing = sum(is.na(length))
  )

# Boxplots of call duration by ethnicity, excluding rows with no ethnicity.
CombineData %>%
  filter(!is.na(ethnicity)) %>%
  ggplot(aes(x = length, y = ethnicity, fill = ethnicity)) +
  geom_boxplot() +
  ggtitle("Duration By Ethnicity") +
  ylab("Ethnicity") +
  xlab("Duration") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )
# One-way ANOVA of call duration across ethnicity groups.
summary(aov(length ~ ethnicity, data = CombineData))
## Df Sum Sq Mean Sq F value Pr(>F)
## ethnicity 3 6373 2124 0.063 0.979
## Residuals 143855 4830335219 33578
## 7078 observations deleted due to missingness
Waiting time by ethnicity
# Descriptive statistics of waiting time by agent ethnicity.
CombineData %>%
  group_by(ethnicity) %>%
  summarise(
    Count = n(),
    Min = min(waiting, na.rm = TRUE),
    "1st Quantile" = quantile(waiting, probs = 0.25, na.rm = TRUE),
    Median = median(waiting, na.rm = TRUE),
    Mean = mean(waiting, na.rm = TRUE),
    "3rd Quantile" = quantile(waiting, probs = 0.75, na.rm = TRUE),
    Max = max(waiting, na.rm = TRUE),
    Missing = sum(is.na(waiting))
  )

# Boxplots of waiting time by ethnicity, excluding rows with no ethnicity.
CombineData %>%
  filter(!is.na(ethnicity)) %>%
  ggplot(aes(x = waiting, y = ethnicity, fill = ethnicity)) +
  geom_boxplot() +
  ggtitle("Waiting Time By Ethnicity") +
  ylab("Ethnicity") +
  xlab("Waiting time") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )
# One-way ANOVA of waiting time across ethnicity groups.
summary(aov(waiting ~ ethnicity, data = CombineData))
## Df Sum Sq Mean Sq F value Pr(>F)
## ethnicity 3 21006 7002 1.719 0.161
## Residuals 143855 585932514 4073
## 7078 observations deleted due to missingness
library(psych)

# Request a 12 x 12 plot size via the repr.* options.
options(repr.plot.width = 12, repr.plot.height = 12)

# Scatter-plot matrix of the numeric call and agent variables with Pearson
# correlations, histograms with density curves, and correlation ellipses.
CombineData %>%
  select(length, waiting, customer_satisfaction, age, tenure) %>%
  pairs.panels(
    method = "pearson",
    hist.col = "#00AFBB",
    density = TRUE,
    ellipses = TRUE
  )